先把程式碼函式化了!不過好像沒什麼用...
#! /usr/bin/env python
#-*- coding: utf-8 -*-
import urllib.request
import re
def get_attr( TagList, targetList, hostUrl ):
    """Collect the unique ``href`` values found in the selected tags.

    Args:
        TagList: iterable of ``(tag_name, attribute_text)`` pairs.
        targetList: tag names whose attributes are inspected (e.g. ``["a"]``).
        hostUrl: URL prefix that a quoted link must start with to be kept.

    Returns:
        A list of link strings in first-seen order, without duplicates.

    Notes:
        Relative paths are not resolved; a quoted relative link never starts
        with ``hostUrl`` and is therefore dropped.
        Unquoted ``href`` values are kept without the host check, exactly as
        the original code did.
    """
    prefix = "href="
    quoteChars = ( '"', "'" )
    siteMap = []
    for tag in TagList:
        # Only look at the requested tags (e.g. <a>).
        if tag[0] not in targetList:
            continue
        for attr in tag[1].split( " " ):
            if not attr.startswith( prefix ):
                continue
            value = attr[len( prefix ):]
            if not value:
                # A bare "href=" token (e.g. `href= "x"` split on spaces)
                # has nothing to inspect; indexing it would raise IndexError.
                continue
            if value[0] in quoteChars:
                # Drop the opening quote and the trailing character, which is
                # assumed to be the matching closing quote.
                link = value[1:-1]
                # Same-domain check: a literal, anchored prefix comparison.
                # An unanchored regex built from hostUrl would treat "." as a
                # wildcard and accept foreign URLs that merely contain it.
                if not link.startswith( hostUrl ):
                    continue
            else:
                link = value
            # Skip links that were already recorded.
            if link not in siteMap:
                siteMap.append( link )
    return siteMap
def get_sitmap( TagList, hostUrl ):
    """Build the site map from the links of every ``<a>`` tag.

    Args:
        TagList: iterable of ``(tag_name, attribute_text)`` pairs.
        hostUrl: URL prefix forwarded to ``get_attr`` for link filtering.

    Returns:
        A one-element list whose single item is the list of links returned
        by ``get_attr`` (i.e. the result is nested: ``[[link, ...]]``).
    """
    anchorLinks = get_attr( TagList, [ "a" ], hostUrl )
    return [ anchorLinks ]
def get_tag( data ):
    """Extract ``(tag_name, attribute_text)`` pairs from raw HTML bytes.

    Only opening tags that have a space after the tag name are matched, so
    attribute-less tags such as ``<br>`` and closing tags are not reported.

    Args:
        data: the page content as UTF-8 encoded bytes.

    Returns:
        A list of ``(tag_name, attribute_text)`` tuples in document order.

    Raises:
        UnicodeDecodeError: if ``data`` is not valid UTF-8.
    """
    # Raw string so that "\w" reaches the regex engine untouched instead of
    # being parsed as an (invalid) string escape sequence.
    tagPattern = re.compile( r"<([\w]+) (.*?)>" )
    html = data.decode( "utf-8" )
    # findall already returns a fresh list of 2-tuples (one per match).
    return tagPattern.findall( html )
def get_url_data( targetUrl ):
    """Fetch a URL and return its body together with the final URL.

    Args:
        targetUrl: the URL to open with ``urllib.request.urlopen``.

    Returns:
        A ``(data, hostUrl)`` tuple: the response body as bytes and the URL
        reported by the response's ``geturl()``.
    """
    # The context manager closes the response even if read() or geturl()
    # raises, which the manual close() call did not guarantee.
    with urllib.request.urlopen( targetUrl ) as response:
        data = response.read()
        hostUrl = response.geturl()
    return data, hostUrl
def test():
    """Fetch the hard-coded test host, build its site map and print it."""
    pageBytes, finalUrl = get_url_data( "http://192.168.1.8/" )
    tags = get_tag( pageBytes )
    # Print the nested link list produced by get_sitmap.
    print( get_sitmap( tags, finalUrl ) )
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    test()
明天繼續努力!以上!晚安!